It seems like PERMANOVA is pretty good at doing discriminant analyses. When is PLS-DA better, if ever?
Try modifying the function to allow for different variances between groups. PERMANOVA is not robust to differences in variances between groups. See how PLS-DA does. It looks like both methods are sensitive to departures from homogeneity of variances.
Test PLS-DA vs. PERMANOVA with the discriminating variables co-varying and not co-varying. PERMANOVA should be about the same, but PLS-DA should be different because it accounts for co-variation, right? Check R^2X values.
Do the above test with a different method of generating discriminating variables.
Look up DDA post-hoc test for permanova.
*Do same exact test but with chemhelper::sim_multvar to figure out what is wrong with it.
# Required Packages
library(MASS)
library(tidyverse)
library(ropls)
library(chemhelper)
#chemhelper contains custom functions for interfacing with ropls package in a friendlier way (and other things).
#Install with devtools::install_github("Aariq/chemhelper")
library(cowplot) #for making and saving prettier plots
library(broom) #for tidy(), which turns model output into dataframes
library(vegan) #for PERMANOVA
This code chunk creates the function sim.vcov() which generates a dataset where we can tune signal, noise, discriminating variables, strength of signal, and strength of discriminating variables. I’m using the notation n for number of observations and p for number of variables. sim.vcov() requires the following arguments:
p_noise: how many variables with near-zero covariance (0.001)?
p_signal: how many variables with some covariance (covariance = cov_signal)?
p_disc: how many discriminating variables?
cov_signal: the covariance for the variables with non-zero covariance.
cov_disc: the covariance for the discriminating variables.
diff_disc: the difference in means for the discriminating variables between the two groups.
group_vars: the variances for the two groups (unequal values give heterogeneous group variances).
N: how many observations?
seed: passed to set.seed()
# p_noise = 0
# p_signal = 5
# p_disc = 15
# cov_signal = 0.5
# diff_disc = 1
# cov_disc = 0.5
# group_vars = c(1,2)
# N = 20
# seed = NA
# Simulate a two-group multivariate dataset for benchmarking discriminant
# analyses (PLS-DA vs. PERMANOVA). Three kinds of variables are generated:
#   * noise variables:  mutual covariance 0.001, variance 1, no group effect
#   * signal variables: mutual covariance `cov_signal`, no group effect
#   * disc variables:   mutual covariance `cov_disc`, group means differ by
#                       `diff_disc` (+diff_disc/2 for "a", -diff_disc/2 for "b")
#
# Arguments:
#   p_noise    number of noise variables (0 omits the block)
#   p_signal   number of covarying, non-discriminating variables (0 omits)
#   p_disc     number of discriminating variables (0 omits)
#   cov_signal covariance among the signal variables
#   cov_disc   covariance among the discriminating variables
#   diff_disc  difference in group means for the discriminating variables
#   group_vars length-2 vector of variances for groups "a" and "b";
#              unequal values simulate heterogeneity of variance
#   N          total number of observations (must be even; N/2 per group)
#   seed       passed to set.seed(); a random seed is drawn when NA
#
# Returns a data frame whose first column is `group` ("a"/"b"), followed by
# the disc_*, signal_*, and noise_* columns (blocks with p = 0 are omitted).
sim.vcov <- function(p_noise,
                     p_signal,
                     p_disc,
                     cov_signal,
                     cov_disc,
                     diff_disc,
                     group_vars = c(1, 1),
                     N,
                     seed = NA) {
  # N/2 rows are drawn per group and the group labels assume an even split,
  # so fail fast on an odd N instead of silently misaligning labels.
  stopifnot(N %% 2 == 0, length(group_vars) == 2)
  # Choose a random seed if none is supplied. Bounded so that seed + 3
  # (used below) cannot overflow the integer range.
  if (is.na(seed)) {
    seed <- sample.int(.Machine$integer.max - 3L, 1)
  }
  # --- noise block: variance 1, tiny covariance, identical in both groups
  if (p_noise > 0) {
    S_noise <- matrix(rep(0.001, p_noise^2), ncol = p_noise)
    diag(S_noise) <- 1
    set.seed(seed)
    data_noise <- mvrnorm(n = N, Sigma = S_noise, mu = rep(0, p_noise)) %>%
      as.data.frame()
    colnames(data_noise) <- paste0("noise_", seq_len(p_noise))
  } else {
    data_noise <- list()
  }
  # --- signal block: covarying but equal group means; per-group variances
  # come from group_vars. Each block uses its own seed offset (seed, +1, +2,
  # +3) so the blocks are not spuriously correlated with one another.
  if (p_signal > 0) {
    S_signal <- matrix(rep(cov_signal, p_signal^2), ncol = p_signal)
    S_signal_B <- S_signal_A <- S_signal
    diag(S_signal_A) <- group_vars[1]
    diag(S_signal_B) <- group_vars[2]
    set.seed(seed + 1)
    signal_A <- mvrnorm(n = N / 2, Sigma = S_signal_A, mu = rep(0, p_signal)) %>%
      as.data.frame()
    signal_B <- mvrnorm(n = N / 2, Sigma = S_signal_B, mu = rep(0, p_signal)) %>%
      as.data.frame()
    data_signal <- bind_rows(signal_A, signal_B)
    colnames(data_signal) <- paste0("signal_", seq_len(p_signal))
  } else {
    data_signal <- list()
  }
  # --- discriminating block: group means are +/- diff_disc / 2
  if (p_disc > 0) {
    S_disc <- matrix(rep(cov_disc, p_disc^2), ncol = p_disc)
    # group A
    diag(S_disc) <- group_vars[1]
    set.seed(seed + 2)
    means_A <- rep(diff_disc / 2, p_disc)
    data_A <- mvrnorm(n = N / 2, mu = means_A, Sigma = S_disc) %>%
      as.data.frame()
    # group B
    diag(S_disc) <- group_vars[2]
    set.seed(seed + 3)
    means_B <- rep(-diff_disc / 2, p_disc)
    data_B <- mvrnorm(n = N / 2, mu = means_B, Sigma = S_disc) %>%
      as.data.frame()
    data_disc <- bind_rows(data_A, data_B)
    colnames(data_disc) <- paste0("disc_", seq_len(p_disc))
  } else {
    data_disc <- list()
  }
  # column-bind the blocks and prepend the group label
  bind_cols(data_disc, data_signal, data_noise) %>%
    add_column(group = rep(c("a", "b"), each = N / 2), .before = 1)
}
First, make a bunch of datasets with the same parameters
nperm = 50 #how many permutations (replicate datasets) to simulate
#generate data: nperm independent datasets with identical parameters;
#seed = NA means each replicate draws its own random seed inside sim.vcov()
data1.list <- map(1:nperm,
                  ~sim.vcov(p_noise = 10,
                            p_signal = 25,
                            p_disc = 5,
                            cov_signal = 0.8,
                            diff_disc = 1.2,
                            cov_disc = 0.2,
                            N = 30,
                            seed = NA))
# data1.list[[1]]
# data1.list[[1]]
Do PERMANOVA on all of them
# Run a PERMANOVA (Euclidean distance) on each simulated dataset and keep
# only the p-value for the group term from the permutation ANOVA table.
# NOTE(review): adonis() is deprecated in recent vegan releases in favor of
# adonis2(), whose result is indexed differently -- confirm installed version.
PERMANOVAresults1 <- map_dbl(data1.list, function(d) {
  fit <- adonis(select(d, -group) ~ d$group, method = "eu")
  fit$aov.tab$`Pr(>F)`[1]
})
Now do PLS-DA on all of them and get p-values (this will take a long time)
And extract p-values
# NOTE(review): `plsda1.list` is not defined anywhere in this file -- the
# chunk that fits the PLS-DA models (presumably via ropls/chemhelper on
# data1.list) appears to be missing from this extract; confirm before running.
# get_plotdata() is from chemhelper; $model_stats presumably holds the
# permutation p-values (incl. pQ2 used in the plots below) -- verify.
PLSresults1 <- plsda1.list %>% map_df(~get_plotdata(.)$model_stats)
# PLSresults1
# one row per replicate: PERMANOVA p-value alongside the PLS-DA model stats
comparison1 <- bind_cols("permanova"= PERMANOVAresults1, PLSresults1)
# Density of PERMANOVA p-values across replicates; dashed line marks 0.05.
p1.permanova <- ggplot(comparison1) +
  geom_density(aes(permanova), fill = "green", alpha = 0.33) +
  geom_vline(xintercept = 0.05, linetype = 5) +
  labs(x = "p value", title = "PERMANOVA") +
  xlim(0, 0.6)
# p1.permanova
# Density of the PLS-DA pQ2 p-values on the same replicates.
p1.plsda <- ggplot(comparison1) +
  geom_density(aes(pQ2), fill = "red", alpha = 0.33) +
  geom_vline(xintercept = 0.05, linetype = 5) +
  labs(x = "p value", title = "PLS-DA")
# p1.plsda
p1 <- plot_grid(p1.permanova, p1.plsda)
First, make a bunch of datasets with the same parameters
# nperm = 100 #how many permutations (left at 50, set in the first scenario)
#generate data: identical to data1.list except cov_disc = 0.6
#(stronger covariance among the discriminating variables)
data2.list <- map(1:nperm,
                  ~sim.vcov(p_noise = 10,
                            p_signal = 25,
                            p_disc = 5,
                            cov_signal = 0.8,
                            diff_disc = 1.2,
                            cov_disc = 0.6,
                            N = 30,
                            seed = NA))
# data2.list[[1]]
# data2.list[[1]]
Do PERMANOVA on all of them
# Same PERMANOVA extraction as scenario 1, applied to the cov_disc = 0.6 data.
PERMANOVAresults2 <- map_dbl(data2.list, function(d) {
  fit <- adonis(select(d, -group) ~ d$group, method = "eu")
  fit$aov.tab$`Pr(>F)`[1]
})
Now do PLS-DA on all of them and get p-values (this will take a long time)
And extract p-values
# NOTE(review): `plsda2.list` is not defined anywhere in this file -- the
# chunk fitting the PLS-DA models on data2.list appears to be missing from
# this extract; confirm before running.
PLSresults2 <- plsda2.list %>% map_df(~get_plotdata(.)$model_stats)
# PLSresults2
# one row per replicate: PERMANOVA p-value alongside the PLS-DA model stats
comparison2 <- bind_cols("permanova"= PERMANOVAresults2, PLSresults2)
# Same density plots for scenario 2 (no xlim on the PERMANOVA panel here,
# matching the original).
p2.permanova <- ggplot(comparison2) +
  geom_density(aes(permanova), fill = "green", alpha = 0.33) +
  geom_vline(xintercept = 0.05, linetype = 5) +
  labs(x = "p value", title = "PERMANOVA")
# p2.permanova
p2.plsda <- ggplot(comparison2) +
  geom_density(aes(pQ2), fill = "red", alpha = 0.33) +
  geom_vline(xintercept = 0.05, linetype = 5) +
  labs(x = "p value", title = "PLS-DA")
# p2.plsda
p2 <- plot_grid(p2.permanova, p2.plsda)
# Stack the two scenarios: top row cov_disc = 0.2, bottom row cov_disc = 0.6.
scenario1 <- plot_grid(p1, p2, labels = c("cov = 0.2", "cov = 0.6"), ncol = 1)
scenario1
PLS-DA performs better than PERMANOVA when there are many co-varying variables not related to the response variable. This makes sense because it is analogous to ANOVA but uses covariances. So the ratio of the within-group covariance to the between-group covariance shrinks when there are a bunch of highly correlated variables not related to group membership, yeah?
This was an artifact created by the same seed being used for multiple pieces of the dataset. That made them correlated when they shouldn’t be.
# Example call with the scenario-2 parameters (result printed, not stored).
# Fixed: the fragment originally ended with an unmatched ")" -- a syntax error.
sim.vcov(p_noise = 10,
         p_signal = 25,
         p_disc = 5,
         cov_signal = 0.8,
         diff_disc = 1.2,
         cov_disc = 0.6,
         N = 30,
         seed = NA)
# Mean of each statistic across replicates, one row per scenario, labeled by
# the cov_disc value used to generate it. Fixed: the original label "0.7"
# did not match the data -- comparison2 was generated with cov_disc = 0.6.
comparison1 %>%
  summarise_all(mean) %>%
  bind_rows(comparison2 %>% summarise_all(mean)) %>%
  add_column(cov = c("0.2", "0.6"), .before = 1)
# One dataset (fixed seed) for eyeballing the induced correlation structure.
test <- sim.vcov(p_noise = 20,
                 p_signal = 10,
                 p_disc = 10,
                 cov_signal = 0.8,
                 diff_disc = 1.2,
                 cov_disc = 0.5,
                 N = 30,
                 seed = 100)
# ggplot(test, aes(x = disc_4, y = disc_5, color = group)) +
#   geom_point()
library(iheatmapr)
# Heatmap of the correlation matrix (group column excluded): the disc_ and
# signal_ blocks should appear as correlated sub-blocks.
cor_mat <- test %>% select(-group) %>% cor()
iheatmap(cor_mat, row_labels = TRUE, col_labels = TRUE)